This is the file where we show all of the graphs and analyses we've performed on our datasets.
Here, we load all of our data and modules that we're going to use.
# --- Imports and raw data loading -------------------------------------------
import pandas as pd
import numpy as np
from ast import literal_eval
import matplotlib.pyplot as plt
import igraph as ig
# Root folder of the raw CSV exports.
PATH_DATA = "data/"
# Toggle: write the cleaned frames back to disk in the export cell below.
save_frame = False
# id_followers and department are stored as Python literals (list / tuple)
# inside the CSV, so parse them with ast.literal_eval on load.
accounts_data = pd.read_csv(PATH_DATA + "instagram_accounts.csv", converters={'id_followers': literal_eval, 'department': literal_eval})
posts_data1 = pd.read_csv(PATH_DATA + "instagram_posts.csv")
posts_data2 = pd.read_csv(PATH_DATA + "instagram_posts_1211_1611.csv")
# The two post exports cover different date ranges; stack them row-wise.
posts_data = pd.concat([posts_data1, posts_data2],axis=0)
# Checking if our duplicate user changes anything ...
# 603282 is the duplicated account found in the cleaning step below; its
# posts are dropped up-front.
posts_data = posts_data[posts_data['id_user'] != 603282]
posts_data.reset_index(drop=True, inplace=True)
Here, we check for duplicates, empty values, etc. in our Accounts and Posts dataframes. We end up finding one duplicate user in Accounts, so we need to remove that user, remove them from all follower lists, and recompute the follower/following counts.
Posts data, on the other hand, was far nicer.
# Checking for empty cells across the data
accounts_data.isnull().values.any()
False
# Checking for duplicates - and we found one!
print(accounts_data.shape)
# Every user id that occurs more than once (the repeats after the first hit).
duplicated_user_id = accounts_data[ accounts_data['id_user'].duplicated() == True ]["id_user"].values
print(duplicated_user_id)
# keep=False drops *all* rows of a duplicated id, not just the repeats.
accounts_data.drop_duplicates(subset=['id_user'], inplace=True, keep=False)
accounts_data.reset_index(drop=True, inplace=True)
# "Unnamed: 0" is the stale CSV index column left over from the export.
accounts_data.drop(columns=["Unnamed: 0"], inplace=True)
print(accounts_data.head())
# Purge the duplicated user's id from every account's follower list, then
# refresh the follower counts to match.
for row_idx in range(len(accounts_data)):
    followers = accounts_data.at[row_idx, "id_followers"]
    for dup_id in duplicated_user_id:
        if dup_id in followers:
            followers.remove(dup_id)
    accounts_data.at[row_idx, "id_followers"] = followers
accounts_data["nb_followers"] = accounts_data["id_followers"].apply(len)
(3047, 11)
[603282]
id_user nb_followers nb_following nb_posts sex \
0 288877 167 58 48 female
1 140311 67 72 8 female
2 182096 142 95 14 female
3 208875 249 99 150 male
4 960092 96 114 19 female
id_followers \
0 [738818, 134147, 314454, 977416, 926730, 82740...
1 [380289, 341188, 775558, 998151, 246792, 17869...
2 [524806, 968200, 241324, 233490, 188948, 15054...
3 [776192, 164353, 989698, 134147, 305670, 65792...
4 [858624, 896013, 138779, 817185, 854563, 88580...
department email \
0 (64, Pyrénées-Atlantiques) zacharieweber@live.com
1 (03, Allier) hugues65@fontaine.com
2 (27, Eure) gilles11@live.com
3 (22, Côtes-d'Armor) manoncolin@gmail.com
4 (74, Haute-Savoie) valerie79@guichard.com
user_agent birth_date
0 Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_10_... 06/22/82
1 Opera/9.55.(X11; Linux i686; wo-SN) Presto/2.9... 01/06/96
2 Mozilla/5.0 (Windows; U; Windows NT 4.0) Apple... 07/22/61
3 Mozilla/5.0 (Windows NT 4.0; nds-NL; rv:1.9.1.... 03/14/75
4 Mozilla/5.0 (Android 3.2.1; Mobile; rv:12.0) G... 07/26/67
# Recount how many accounts each user follows: user u "follows" v whenever
# u appears in v's follower list.
follow_counts = dict.fromkeys(accounts_data["id_user"].values, 0)
for follower_list in accounts_data["id_followers"]:
    for follower_id in follower_list:
        follow_counts[follower_id] += 1
# Write the recomputed counts back, addressing rows by user id.
accounts_data.set_index("id_user", inplace=True)
for user_id, count in follow_counts.items():
    accounts_data.at[user_id, "nb_following"] = count
accounts_data.reset_index(inplace=True)
accounts_data.head()
| id_user | nb_followers | nb_following | nb_posts | sex | id_followers | department | user_agent | birth_date | ||
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 288877 | 167 | 161 | 48 | female | [738818, 134147, 314454, 977416, 926730, 82740... | (64, Pyrénées-Atlantiques) | zacharieweber@live.com | Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_10_... | 06/22/82 |
| 1 | 140311 | 67 | 150 | 8 | female | [380289, 341188, 775558, 998151, 246792, 17869... | (03, Allier) | hugues65@fontaine.com | Opera/9.55.(X11; Linux i686; wo-SN) Presto/2.9... | 01/06/96 |
| 2 | 182096 | 142 | 161 | 14 | female | [524806, 968200, 241324, 233490, 188948, 15054... | (27, Eure) | gilles11@live.com | Mozilla/5.0 (Windows; U; Windows NT 4.0) Apple... | 07/22/61 |
| 3 | 208875 | 249 | 164 | 150 | male | [776192, 164353, 989698, 134147, 305670, 65792... | (22, Côtes-d'Armor) | manoncolin@gmail.com | Mozilla/5.0 (Windows NT 4.0; nds-NL; rv:1.9.1.... | 03/14/75 |
| 4 | 960092 | 95 | 162 | 19 | female | [858624, 896013, 138779, 817185, 854563, 88580... | (74, Haute-Savoie) | valerie79@guichard.com | Mozilla/5.0 (Android 3.2.1; Mobile; rv:12.0) G... | 07/26/67 |
# No duplicates across Posts! :)
posts_data[ posts_data['id_user'].duplicated() == True ]
| Unnamed: 0 | id_user | id_post | date | time | half_day | views | reposts | likes | comments | id_post_origin | link_clicks | donation_tag | donation_val | house_buy |
|---|
# How our duplicate user fares across posts:
# He's a terminal node on the graph, so there's little to no effect from him.
posts_data[posts_data["id_user"] == 603282]
| Unnamed: 0 | id_user | id_post | date | time | half_day | views | reposts | likes | comments | id_post_origin | link_clicks | donation_tag | donation_val | house_buy |
|---|
Exporting our data
# Persist the cleaned frames only when explicitly requested via the
# save_frame toggle at the top of the file.
if save_frame:
    posts_data.to_csv('data/new/instagram_posts.csv')
    accounts_data.to_csv('data/new/instagram_accounts.csv')
In this section, we can take a look at statistical properties of our data.
## Sex Data
accounts_data['sex'].value_counts()
female 1550 male 1495 Name: sex, dtype: int64
accounts_data['department'].value_counts().sort_values(ascending=False)
(55, Meuse) 44
(972, Martinique) 42
(85, Vendée) 40
(31, Haute-Garonne) 39
(27, Eure) 39
..
(52, Haute-Marne) 21
(2A, Corse-du-Sud) 20
(36, Indre) 20
(02, Aisne) 19
(04, Alpes-de-Haute-Provence) 18
Name: department, Length: 101, dtype: int64
accounts_data["nb_followers"].hist()
plt.show()
accounts_data["nb_followers"].describe()
count 3045.000000 mean 160.211166 std 51.991272 min 60.000000 25% 115.000000 50% 159.000000 75% 205.000000 max 255.000000 Name: nb_followers, dtype: float64
accounts_data["nb_following"].hist()
plt.show()
accounts_data["nb_following"].describe()
count 3045.000000 mean 160.211166 std 12.330283 min 115.000000 25% 152.000000 50% 160.000000 75% 169.000000 max 199.000000 Name: nb_following, dtype: float64
# Birth dates use two-digit years; pandas parses some of them into the
# future, which makes the computed age negative.
age = pd.to_datetime(accounts_data["birth_date"])
# Convert the timedelta to (fractional) years.
age = (pd.to_datetime("today") - age) / np.timedelta64(1, "Y")
# Shift future-parsed birth dates back one century (e.g. 2061 -> 1961).
age[age < 0] += 100
accounts_data["age"] = age.astype('int32')
print(accounts_data["age"].describe())
accounts_data["age"].hist(histtype="bar", ec="black")
plt.title("Ages of our users")
plt.ylabel("# of users")
plt.xlabel("Binned ages")
plt.show()
count 3045.000000 mean 38.930378 std 12.515001 min 18.000000 25% 28.000000 50% 39.000000 75% 50.000000 max 60.000000 Name: age, dtype: float64
## House_Buy Data
posts_data['house_buy'].value_counts()
False 3045 Name: house_buy, dtype: int64
posts_data["views"].hist()
posts_data["views"].describe()
count 3045.000000 mean 82.377997 std 62.351263 min 0.000000 25% 34.000000 50% 68.000000 75% 120.000000 max 387.000000 Name: views, dtype: float64
posts_data['link_clicks'].value_counts()
False 2726 True 319 Name: link_clicks, dtype: int64
posts_data["id_post_origin"].value_counts(sort=True, ascending=False)
638779430 18
144265898 14
760062606 14
844989061 12
650889385 11
..
761534124 1
249527892 1
295538073 1
741748710 1
465290461 1
Name: id_post_origin, Length: 1449, dtype: int64
posts_data[["id_post","reposts"]].sort_values(by="reposts", ascending=False)
| id_post | reposts | |
|---|---|---|
| 0 | 638779430 | 18 |
| 3 | 144265898 | 14 |
| 23 | 760062606 | 14 |
| 20 | 844989061 | 12 |
| 15 | 972167608 | 11 |
| ... | ... | ... |
| 1746 | 722483778 | 0 |
| 1744 | 543005466 | 0 |
| 1743 | 895528849 | 0 |
| 1742 | 831459158 | 0 |
| 3044 | 225756005 | 0 |
3045 rows × 2 columns
posts_data.head()
| Unnamed: 0 | id_user | id_post | date | time | half_day | views | reposts | likes | comments | id_post_origin | link_clicks | donation_tag | donation_val | house_buy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 672702 | 638779430 | 09/11/2017 | 07:53 | am | 2 | 18 | 0 | 0 | 0 | True | False | 0 | False |
| 1 | 1 | 474227 | 953043456 | 09/11/2017 | 02:03 | am | 36 | 11 | 16 | 1 | 0 | True | False | 0 | False |
| 2 | 2 | 587566 | 650889385 | 09/11/2017 | 02:57 | am | 59 | 11 | 8 | 1 | 0 | True | False | 0 | False |
| 3 | 3 | 483543 | 144265898 | 09/11/2017 | 11:43 | am | 21 | 14 | 9 | 2 | 0 | True | False | 0 | False |
| 4 | 4 | 394103 | 955542283 | 09/11/2017 | 04:53 | pm | 16 | 3 | 2 | 0 | 638779430 | False | False | 0 | False |
# Normalise malformed 12-hour timestamps: the raw data uses "00:" for
# twelve o'clock (should be "12:") and "13:" for one o'clock (should be
# "01:"). The substring only ever occurs at the hour position of "HH:MM"
# (minutes are never followed by a colon), so a plain replace is safe.
# Fixed: the old lambdas shadowed the builtin `str`; vectorized
# Series.str.replace is also faster than .apply with a Python lambda.
posts_data['time'] = posts_data['time'].str.replace("00:", "12:", regex=False)
posts_data['time'] = posts_data['time'].str.replace("13:", "01:", regex=False)
# Build a proper datetime from "HH:MM" + "AM"/"PM" to get the hourly
# distribution of posts.
posts_data['time24'] = posts_data['time'] + " " + posts_data['half_day'].str.upper()
posts_data['time24'] = pd.to_datetime(posts_data['time24'], format="%I:%M %p")
posts_data['time24'].dt.hour.hist(bins=24, rwidth=0.5)
plt.xticks(rotation=90)
(array([-5., 0., 5., 10., 15., 20., 25.]), [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])
This section constructs the graphs of Accounts and Posts that we'll use in the metrics to follow.
# Node i of the igraph object corresponds to row i of accounts_data; keep
# both directions of the node-index <-> user-id mapping.
mappingFrNodeToUserId = dict(zip(range(len(accounts_data)), accounts_data['id_user']))
mappingFrUserIdToNode = {v: k for k,v in mappingFrNodeToUserId.items()}
# user id -> list of that user's follower ids
dict_followers = dict( zip(accounts_data['id_user'], accounts_data['id_followers']) )
# One directed edge per (account, follower) pair, expressed in node indices;
# edges point from the followed account towards its follower.
edges=[(mappingFrUserIdToNode[node_i], node_j) for node_i in dict_followers.keys() for node_j in list(map(lambda x: mappingFrUserIdToNode[x], dict_followers[node_i]))]
# print(edges)
accounts = ig.Graph(edges=edges, directed=True)
accounts.vs["size"] = 1
accounts.layout_lgl()
# igraph.plot(accounts)
<Layout with 3045 vertices and 2 dimensions>
# get_diameter() returns the vertex sequence along one longest shortest
# path, not a single number.
acc_diameters = accounts.get_diameter()
print(acc_diameters)
[1, 5, 25, 675]
Conclusion: Our users have broad connections, not deep ones.
Here we construct the graph for the posts. Each node is identified by its index in the dataframe, and then, its attributes are given as node attributes. All edges are added based on this.
def construct_posts_graph(graph_data):
    """Build the directed repost graph for `graph_data` (one vertex per row).

    Each vertex carries the row's `id_post`, `id_post_origin` and `id_user`
    as vertex attributes. A row with a non-zero `id_post_origin` is a
    repost and gets an edge from the original post's vertex to its own
    (original -> repost). Assumes a contiguous 0..n-1 index (it is reset
    upstream) and unique `id_post` values.
    """
    graph = ig.Graph(directed=True)
    # add_vertices with explicit values also sets the 'name' attribute,
    # which interactivity()/visibility() rely on.
    graph.add_vertices(graph_data.index.values)
    # Bulk-assign vertex attributes: one pass instead of per-vertex writes.
    graph.vs['id_post'] = graph_data['id_post'].tolist()
    graph.vs['id_post_origin'] = graph_data['id_post_origin'].tolist()
    graph.vs['id_user'] = graph_data['id_user'].tolist()
    # Precompute id_post -> row index so each edge lookup is O(1) instead of
    # a full-column scan per row (the old version was quadratic overall).
    index_of_post = dict(zip(graph_data['id_post'], graph_data.index))
    edges = [(index_of_post[origin], idx)
             for idx, origin in zip(graph_data.index, graph_data['id_post_origin'])
             if origin != 0]
    # One batched add_edges() call is dramatically faster in igraph than
    # calling add_edges() once per edge.
    graph.add_edges(edges)
    # Cosmetic defaults used by ig.plot().
    graph.vs['size'] = 7
    graph.vs['arrow_size'] = 1
    graph.vs['arrow_width'] = 1
    return graph
posts = construct_posts_graph(posts_data)
ig.plot(posts)
# Finding out the components of our posts-graph
post_components = posts.clusters(mode='weak')
print(len(post_components))
# and getting the users attributed for each.
4
Let's compute our KPI's!
Here, we implement the first metric proposed in the first deliverable. This measures the number of likes, clicks, reposts, donations etc. that each post has. We can use this to create a composite ranking, which then provides us a KPI to maximise.
We expect 4 components - one associated to each of the 4 original posts used to seed our userbase. And that's what we get ! Of course, we have weak components because we're working in a Directed Acyclic Graph, so we'll need to ignore the direction of our edges to find components.
def interactivity(graph, graph_data):
    """Aggregate interaction counts per campaign component.

    For every original post (vertex with id_post_origin == 0) we sum likes,
    comments, reposts, link clicks and donations over its whole weak
    component. Returns a DataFrame with one column per original post plus a
    'Total' column. The graph is a DAG, so edge direction must be ignored
    (mode='weak') to recover the campaign trees; relies on clusters()
    listing components in the same order as the original posts.
    """
    original_posts = graph.vs(id_post_origin_eq=0)
    components = dict( zip(original_posts[:]['name'], graph.clusters(mode='weak') ) )
    # Metric row order is preserved by dict insertion order and must stay
    # stable: get_interactivity_score() weights these rows by label.
    metrics = ('like', 'comment', 'repost', 'clicks',
               'donations_tag_count', 'donations_value')
    cumulative_interactions = {
        key: {metric: 0 for metric in metrics} for key in components
    }
    for comp_idx, members in components.items():
        totals = cumulative_interactions[comp_idx]
        for node in members:
            row = graph_data.loc[node]
            totals['like'] += row['likes']
            totals['comment'] += row['comments']
            totals['repost'] += row['reposts']
            totals['clicks'] += row['link_clicks']
            # BUG FIX: these two accumulators previously used a stale loop
            # variable (`key`) and credited every donation to the last
            # component instead of the component being summed.
            totals['donations_tag_count'] += row["donation_tag"]
            totals['donations_value'] += row['donation_val']
    cumulative_interactions = pd.DataFrame(cumulative_interactions)
    cumulative_interactions['Total'] = cumulative_interactions.sum(axis=1)
    return cumulative_interactions
interactivity_df = interactivity(posts, posts_data)
print(interactivity_df.head())
def get_interactivity_score(interactivity_df):
    """Weighted sum of the campaign's total interactions.

    Weights are applied per metric *by row label*. (The old version zipped
    a dict's values against the rows positionally, and its 'donation_tag'
    key did not even match the actual row name.) Donation value is
    deliberately excluded (weight 0) — it has its own KPI.
    """
    coeff = {'like': 1, 'comment': 2, 'repost': 3, 'clicks': 4,
             'donations_tag_count': 5, 'donations_value': 0}
    return sum(interactivity_df.at[metric, 'Total'] * weight
               for metric, weight in coeff.items())
print("\nInteractivity Score: ", get_interactivity_score(interactivity_df))
def get_donation_value(interactivity_df):
    """Return the campaign-wide donation total from an interactivity frame."""
    return interactivity_df.loc["donations_value", "Total"]
print("\nDonation Value: ", get_donation_value(interactivity_df))
0 1 2 3 Total like 23950 16903 12835 14262 67950 comment 4540 3082 2492 2594 12708 repost 1099 723 572 649 3043 clicks 109 76 61 73 319 donations_tag_count 0 0 0 31 31 Interactivity Score: 103926 Donation Value: 755
Here, we implement a metric proposed as part of the first deliverable, visibility. This measures the total number of views that the campaign has.
def visibility(graph, graph_data):
    """Total views per campaign component (one column per original post).

    Original posts are the vertices with id_post_origin == 0; each one
    anchors a weak component, whose views are summed into that column.
    A 'Total' column aggregates across all components.
    """
    seeds = graph.vs(id_post_origin_eq=0)
    component_of = dict(zip(seeds[:]['name'], graph.clusters(mode='weak')))
    view_totals = {}
    for seed_name, members in component_of.items():
        total_views = 0
        for node in members:
            total_views += graph_data.loc[node]['views']
        view_totals[seed_name] = {'views': total_views}
    result = pd.DataFrame(view_totals)
    result['Total'] = result.sum(axis=1)
    return result
print(visibility(posts, posts_data))
def get_total_visibility(visibility_df):
    """Return the grand total of views across all components."""
    return visibility_df["Total"].iloc[0]
print("Total Visibility: ", get_total_visibility(visibility(posts, posts_data)))
0 1 2 3 Total views 90071 61455 46545 52770 250841 Total Visibility: 250841
Virality is the speed at which the campaign propagated. The notion of speed is captured by the diameter of each connected component. The diameter is inversely proportional to the speed of the campaign, as it denotes how many degrees of separation lie between the source node and the "furthest" node.
def get_component_diameters(posts):
    """Diameter of every weak component of the posts graph, in component order."""
    return [component.diameter() for component in posts.decompose(mode='weak')]
print("Depth of each Original Post:", get_component_diameters(posts))
# diameter_paths = [subgraph.get_diameter() for subgraph in subgraphs]
# print("Actual Path taken by each Depth-y post: ", diameter_paths)
# Conclusion:
def get_max_diameter(components_diameters):
    """Return the largest per-component diameter (the worst-case repost depth)."""
    return max(components_diameters)
print("Longest Diameter: ", get_max_diameter(get_component_diameters(posts)))
Depth of each Original Post: [9, 8, 10, 8] Longest Diameter: 10
def node_properties(posts):
    """Centrality measures of the users behind the original posts.

    Looks up the seed users in the global `accounts` graph and returns a
    DataFrame with their node index, out-closeness, normalised betweenness
    and PageRank.
    """
    # Original posts have id_post_origin == 0.
    seeds = posts.vs(id_post_origin_eq=0)
    seed_user_ids = seeds[:]['id_user']
    print(seed_user_ids)
    # Translate user ids into node indices of the accounts graph.
    seed_nodes = [mappingFrUserIdToNode[uid] for uid in seed_user_ids]
    n = accounts.vcount()
    closeness_scores = np.array(accounts.closeness(seed_nodes, mode="out"))
    # Normalise betweenness by the number of ordered vertex pairs.
    betweenness_scores = np.array(accounts.betweenness(seed_nodes)) / ((n - 1) * (n - 2))
    pagerank_scores = np.array(accounts.pagerank(seed_nodes))
    return pd.DataFrame.from_dict({
        'nodes': seed_nodes,
        'closeness': closeness_scores,
        'betweenness': betweenness_scores,
        'pagerank': pagerank_scores,
    })
print("Node properties of our most important posts: ")
print(node_properties(posts))
Node properties of our most important posts: [672702, 474227, 587566, 483543] nodes closeness betweenness pagerank 0 849 0.510738 0.000232 0.000302 1 1363 0.511769 0.000257 0.000308 2 874 0.501979 0.000159 0.000345 3 51 0.509627 0.000272 0.000361
##Network's best K nodes
# Default number of seed accounts per strategy.
K = 4

def get_k_nodes_random(nodes=K):
    """Pick `nodes` distinct account indices uniformly at random (baseline).

    BUG FIX: the `nodes` parameter was previously ignored in favour of the
    global K.
    """
    candidates = range(len(accounts_data))
    return np.random.choice(candidates, nodes, replace=False)
print(get_k_nodes_random(nodes=K))
def get_k_nodes_closeness(nodes=K):
    """Top-`nodes` accounts by out-closeness centrality.

    BUG FIX: the `nodes` parameter was previously ignored in favour of the
    global K. Note argpartition only guarantees the best `nodes` indices
    end up in the tail — they are not sorted among themselves.
    """
    scores = np.array(accounts.closeness(mode="out"))
    best = np.argpartition(scores, -nodes)[-nodes:]
    return np.flip(best)
print(get_k_nodes_closeness(nodes = K))
def get_k_nodes_betweenness(nodes=K):
    """Top-`nodes` accounts by normalised betweenness centrality.

    BUG FIX: the `nodes` parameter was previously ignored in favour of the
    global K. Betweenness is normalised by the number of ordered vertex pairs.
    """
    n = accounts.vcount()
    scores = np.array(accounts.betweenness()) / ((n - 1) * (n - 2))
    best = np.argpartition(scores, -nodes)[-nodes:]
    return np.flip(best)
print(get_k_nodes_betweenness(nodes = K))
def get_k_nodes_pagerank(nodes=K):
    """Top-`nodes` accounts by PageRank.

    BUG FIX: the `nodes` parameter was previously ignored in favour of the
    global K.
    """
    scores = np.array(accounts.pagerank())
    best = np.argpartition(scores, -nodes)[-nodes:]
    return np.flip(best)
print(get_k_nodes_pagerank(nodes= K))
[1398 797 2377 1925] [ 664 734 1082 894] [2166 2003 2351 856] [2882 1367 1854 1288]
The probability that a user who has seen a post will click on the link to the site.
# Empirical P(click): fraction of all posts whose link was clicked.
prob_click = posts_data['link_clicks'].sum()/len(posts_data)
print("Probability of clicking on a post: ", prob_click*100, "%")
Probability of clicking on a post: 10.476190476190476 %
Evaluated as the number of donors over the number of possible donors (number of site visitors). prob_donation gives the probability that someone donated given that they clicked on the link to the site.
# Posts carrying a donation tag (the candidate donors).
donors= posts_data[posts_data["donation_tag"]]
print( "Number of users who went to the website: ", len(posts_data[posts_data["link_clicks"]]) )
print ("Number of donors:", len(donors[donors['donation_val']>0]))
# Empirical P(donation | click): actual donors over site visitors.
prob_donation = len(donors[donors['donation_val']>0])/len(posts_data[posts_data["link_clicks"]])
print("Empirical probabilty of donation: ", prob_donation*100, "%")
Number of users who went to the website: 319 Number of donors: 31 Empirical probabilty of donation: 9.717868338557993 %
Here, we finally begin our simulations!
The merged_dataset is a tool for us to quickly access users and posts without excessive querying - nothing to worry about :)
The model starts right after this.
# Join per-post stats with per-account stats to derive the per-user rates
# the simulation samples from.
merged_dataset_all = posts_data.merge(accounts_data, on=["id_user"], how='left')
merged_dataset = pd.DataFrame(merged_dataset_all[["id_user", "views", "reposts", "likes", "comments", "link_clicks", "donation_tag", "donation_val", "nb_followers"]])
# Share of a user's followers who view their post.
merged_dataset["percent_views"] = merged_dataset["views"]/merged_dataset["nb_followers"]
# NOTE(review): the three ratios below divide by `views`, which can be 0 —
# presumably that yields inf/NaN rates for zero-view posts; verify downstream.
merged_dataset["percent_reposts"] = merged_dataset["reposts"]/merged_dataset["views"]
merged_dataset["percent_likes"] = merged_dataset["likes"]/merged_dataset["views"]
merged_dataset["percent_comments"] = merged_dataset["comments"]/merged_dataset["views"]
# merged_dataset[merged_dataset["percent_comments"]> 1]
merged_dataset.set_index(['id_user'],inplace=True)
merged_dataset.head()
| views | reposts | likes | comments | link_clicks | donation_tag | donation_val | nb_followers | percent_views | percent_reposts | percent_likes | percent_comments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id_user | ||||||||||||
| 672702 | 2 | 18 | 0 | 0 | True | False | 0 | 131 | 0.015267 | 9.000000 | 0.000000 | 0.000000 |
| 474227 | 36 | 11 | 16 | 1 | True | False | 0 | 140 | 0.257143 | 0.305556 | 0.444444 | 0.027778 |
| 587566 | 59 | 11 | 8 | 1 | True | False | 0 | 77 | 0.766234 | 0.186441 | 0.135593 | 0.016949 |
| 483543 | 21 | 14 | 9 | 2 | True | False | 0 | 123 | 0.170732 | 0.666667 | 0.428571 | 0.095238 |
| 394103 | 16 | 3 | 2 | 0 | False | False | 0 | 93 | 0.172043 | 0.187500 | 0.125000 | 0.000000 |
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import SGDRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
# Train the donation-value regressor only on posts that actually produced
# a donation.
mlmodel_data = merged_dataset_all[merged_dataset_all["donation_val"] > 0]
# X_variables_columns = ["nb_followers", "nb_following", "time24", "nb_posts", "sex", "department", "age"]
X_variables_columns = ["nb_followers", "nb_following", "time24", "nb_posts", "department", "age"]
X_variables = pd.DataFrame(mlmodel_data[X_variables_columns])
# Feature engineering: posting hour from the timestamp, department name
# from the (code, name) tuple.
X_variables["time24"] = X_variables["time24"].apply(lambda x : x.hour)
X_variables["department"] = X_variables["department"].apply(lambda x : x[1])
target_column = ["donation_val"]
target = mlmodel_data[target_column]
# SGDRegressor expects a 1-D target array.
target = target.to_numpy().reshape(len(target),)
# Split features by dtype: object columns get one-hot encoded, numeric
# columns get min-max scaled.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)
numerical_columns = numerical_columns_selector(X_variables)
categorical_columns = categorical_columns_selector(X_variables)
# handle_unknown="ignore": departments unseen at fit time encode to
# all-zeros instead of raising at predict time.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = MinMaxScaler()
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('minmax_scaler', numerical_preprocessor, numerical_columns)])
regr = SGDRegressor(max_iter=10000)
mlmodel = make_pipeline(preprocessor, regr)
_ = mlmodel.fit(X_variables, target)
def get_predicted_donation_value(id_user):
    """Predict a donation amount for `id_user` with the fitted pipeline.

    Returns a non-negative int; 0 means "no (useful) donation predicted".
    """
    feature_columns = ["nb_followers", "nb_following", "time24", "nb_posts", "department", "age"]
    X_var = merged_dataset_all[merged_dataset_all["id_user"] == id_user]
    X_var = pd.DataFrame(X_var[feature_columns])
    # Same feature engineering as at training time.
    X_var["time24"] = X_var["time24"].apply(lambda t: t.hour)
    X_var["department"] = X_var["department"].apply(lambda dep: dep[1])
    donation_val = mlmodel.predict(X_var)
    # predict() returns one value per matching post row — use the first.
    # BUG FIX: int() on the whole ndarray raised for multi-post users and is
    # deprecated even for size-1 arrays in modern NumPy.
    return max(0, int(donation_val[0]))
Compartment Model under consideration.
Poster ---> Followers ---->
View ----> Reposts
View ----> Comment
View ----> Like
Poster ---> Link Click ---> Donation
The simulation essentially works as follows.
We achieve this using a simple Breadth First Traversal of the accounts graph, assuming that the best possible seeds have been chosen to start from. This choice of seeds effectively decides the strategy.
In case of a repost, we populate a simulated posts_data data called new_posts_data.
The number of reposts is given by the outdegree of the graph. This is because each outgoing edge in the Posts graph represents a poster->reposter link between posts.
rand = lambda n, p : np.random.uniform(0, 1, n) < p
## strategy: {"random", "closeness", "betweenness", "pagerank"}
def _click_and_donation(user_id):
    """Sample whether a post leads to a link click and, if so, a donation.

    Returns (clicked, donated, donation_value); donated implies value > 0.
    """
    clicked = np.random.random() < prob_click
    donated = False
    value = 0
    if clicked:
        donated = np.random.random() < prob_donation
        if donated:
            value = get_predicted_donation_value(user_id)
            if value <= 0:
                donated = False
    return clicked, donated, value


def simulation(strategy, nodes):
    """Simulate one campaign seeded with `nodes` accounts chosen by `strategy`.

    strategy: one of {"random", "closeness", "betweenness", "pagerank"}.
    Returns (new_posts_data, new_posts): the simulated posts table and the
    corresponding repost graph. The spread is a breadth-first traversal of
    the accounts graph in which every not-yet-seen viewer of a post reposts.
    """
    strategies = {"random": get_k_nodes_random, "closeness": get_k_nodes_closeness,
                  "betweenness": get_k_nodes_betweenness, "pagerank": get_k_nodes_pagerank}
    # BUG FIX: the seed count passed the global K instead of the `nodes` arg.
    seeded_nodes = strategies[strategy](nodes=nodes)
    new_posts_data = pd.DataFrame(columns=['id_user', 'id_post', 'views',
                                           'reposts', 'likes', 'comments', 'id_post_origin', 'link_clicks',
                                           'donation_tag', 'donation_val'])
    counter = 1
    # --- Seeding: one original post per seeded account ---------------------
    for node in seeded_nodes:
        id_user = accounts_data.at[node, 'id_user']
        # NOTE(review): merged_dataset is indexed by id_user; if a user has
        # several posts this .loc returns a frame, not a row — verify upstream.
        row = merged_dataset.loc[id_user]
        # BUG FIX: the lookup compared the id_user column against the
        # positional node index (and lacked .values[0]), so every seed post
        # started with zero followers and zero views.
        data = accounts_data[accounts_data['id_user'] == id_user]
        followers = np.array(data['id_followers'].values[0])
        new_views = followers[rand(followers.size, row['percent_views'])]
        new_likes = new_views[rand(new_views.size, row['percent_likes'])]
        new_comments = new_views[rand(new_views.size, row['percent_comments'])]
        new_click, new_donor, donation_value = _click_and_donation(id_user)
        new_posts_data = new_posts_data.append({
            'id_user': id_user,
            'id_post': counter,
            'id_post_origin': 0,
            'views': len(new_views),
            'likes': len(new_likes),
            'comments': len(new_comments),
            'reposts': 0,  # filled a posteriori from the graph out-degree
            'link_clicks': new_click,
            'donation_tag': new_donor,
            'donation_val': donation_value,
        }, ignore_index=True)
        counter += 1
    # --- Breadth-first walk over the accounts graph ------------------------
    frontier = [accounts_data.at[node, 'id_user'] for node in seeded_nodes]
    seen_list = {key: False for key in accounts_data['id_user']}
    for seed_user in frontier:
        seen_list[seed_user] = True
    while len(frontier) > 0:
        node = frontier.pop(0)  # node is a user id from here on
        data = accounts_data[accounts_data['id_user'] == node]
        row = merged_dataset.loc[node]
        followers = np.array(data['id_followers'].values[0])
        new_views = followers[rand(followers.size, row['percent_views'])]
        new_likes = new_views[rand(new_views.size, row['percent_likes'])]
        new_comments = new_views[rand(new_views.size, row['percent_comments'])]
        # BUG FIX: the donation prediction used a stale `id_user` left over
        # from the seeding loop; it must be the current poster.
        new_click, new_donor, donation_value = _click_and_donation(node)
        # Every not-yet-seen viewer becomes a reposter.
        # NOTE(review): an earlier draft sampled reposters with
        # percent_reposts, but that result was immediately overwritten; the
        # effective (and kept) behaviour is "all unseen viewers repost".
        new_reposts = np.array([follower for follower in new_views
                                if not seen_list[follower]])
        for nbor in new_reposts:
            if not seen_list[nbor]:
                # NOTE(review): the repost row records the *parent's*
                # view/like/comment counts, as in the original code —
                # confirm this is the intended bookkeeping.
                new_posts_data = new_posts_data.append({
                    'id_user': nbor,
                    'id_post': counter,
                    'id_post_origin': new_posts_data.loc[new_posts_data['id_user'] == node]['id_post'].values[0],
                    'views': len(new_views),
                    'likes': len(new_likes),
                    'comments': len(new_comments),
                    'reposts': 0,
                    'link_clicks': new_click,
                    'donation_tag': new_donor,
                    'donation_val': donation_value,
                }, ignore_index=True)
                counter += 1
                seen_list[nbor] = True
                frontier.append(nbor)
    # (A stray non-inplace set_index('id_user') was a no-op and was dropped.)
    new_posts_data.reset_index(drop=True, inplace=True)
    new_posts = construct_posts_graph(new_posts_data)
    # A post's repost count equals its out-degree in the repost graph.
    new_posts_data['reposts'] = pd.Series(new_posts.degree(mode='out'))
    return new_posts_data, new_posts
Let's see how well we did, as compared to the original campaign.
new_posts_data, new_posts = simulation(strategy="closeness", nodes=K)
ig.plot(new_posts)
new_posts_data.head()
| id_user | id_post | views | reposts | likes | comments | id_post_origin | link_clicks | donation_tag | donation_val | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 196644 | 1 | 0 | 57 | 0 | 0 | 0 | False | False | 0 |
| 1 | 676277 | 2 | 0 | 13 | 0 | 0 | 0 | False | False | 0 |
| 2 | 919964 | 3 | 0 | 141 | 0 | 0 | 0 | False | False | 0 |
| 3 | 523927 | 4 | 0 | 49 | 0 | 0 | 0 | True | False | 0 |
| 4 | 565270 | 5 | 57 | 44 | 2 | 1 | 1 | False | False | 0 |
visibility(new_posts, new_posts_data)
| 0 | 1 | 2 | 3 | Total | |
|---|---|---|---|---|---|
| views | 192548 | 36592 | 71640 | 4423 | 305203 |
# Sanity check: the simulated graph should again split into one weak
# component per seed.
new_components = new_posts.clusters(mode='weak')
print("Number of components in the NewPosts Graph:", len(new_components))
Number of components in the NewPosts Graph: 4
interactivity(new_posts, new_posts_data)
| 0 | 1 | 2 | 3 | Total | |
|---|---|---|---|---|---|
| like | 58714 | 9601 | 25606 | 1429 | 95350 |
| comment | 10952 | 2106 | 3644 | 161 | 16863 |
| repost | 2052 | 288 | 639 | 62 | 3041 |
| clicks | 141 | 94 | 30 | 4 | 269 |
| donations_tag_count | 0 | 0 | 0 | 0 | 0 |
| donations_value | 0 | 0 | 0 | 0 | 0 |
print("The diameters of each component are: ", get_component_diameters(new_posts) )
print('Plotting the smallest subgraph for visual understanding ... ')
# NOTE(review): index 1 is a fixed choice — decompose() does not sort by
# size, so this is not necessarily the smallest component.
subgr = new_posts.decompose(mode='weak')
ig.plot(subgr[1])
The diameters of each component are: [3, 2, 2, 2] Plotting the smallest subgraph for visual understanding ...
# Monte-Carlo: run the simulation many times and average the KPIs.
NUM_OF_SIMULATIONS = 50
strategy = "betweenness"
new_posts_data_and_graph_monte_carlo = []
for _ in range(NUM_OF_SIMULATIONS):
    simu_posts_data, simu_posts = simulation(strategy=strategy, nodes=K)
    new_posts_data_and_graph_monte_carlo.append((simu_posts_data, simu_posts))
visibility_monte_carlo = [visibility(run_graph, run_frame)
                          for run_frame, run_graph in new_posts_data_and_graph_monte_carlo]
average_visibility = sum(visibility_monte_carlo) / len(visibility_monte_carlo)
print(average_visibility)
0 1 2 3 Total views 0.0 188910.66 122280.64 13380.54 324571.84
# Average the interactivity KPIs over the same Monte-Carlo runs.
interactivity_monte_carlo = [interactivity(run_graph, run_frame)
                             for run_frame, run_graph in new_posts_data_and_graph_monte_carlo]
average_interactivity = sum(interactivity_monte_carlo) / len(interactivity_monte_carlo)
print(average_interactivity)
average_interactivity_score = get_interactivity_score(average_interactivity)
print("\nAverage Interactivity Score: ", average_interactivity_score)
average_donation_value = get_donation_value(average_interactivity)
print("\nAverage Donation Value: ", average_donation_value)
0 1 2 3 Total like 0.00 52519.78 34341.60 3543.50 90404.88 comment 0.00 10051.50 6531.22 686.72 17269.44 repost 0.00 1811.26 1085.44 144.30 3041.00 clicks 0.12 188.28 114.08 17.06 319.54 donations_tag_count 0.00 0.00 0.00 32.16 32.16 donations_value 0.00 0.00 0.00 571.88 571.88 Average Interactivity Score: 135505.72 Average Donation Value: 571.88
# Average per-component diameters (virality proxy) over the runs.
virality_monte_carlo = [get_component_diameters(run_graph)
                        for _, run_graph in new_posts_data_and_graph_monte_carlo]
average_virality = np.array(virality_monte_carlo).mean(axis=0)
print(average_virality)
print("Longest Diameter: ", get_max_diameter(average_virality))
[0. 3. 2. 2.] Longest Diameter: 3.0
# Collect the per-run KPIs into one frame and archive it for later
# cross-strategy comparison.
simulation_runs = list(range(1, NUM_OF_SIMULATIONS + 1))
visibility_data = [get_total_visibility(frame) for frame in visibility_monte_carlo]
interactivity_score_data = [get_interactivity_score(frame) for frame in interactivity_monte_carlo]
donation_value_data = [get_donation_value(frame) for frame in interactivity_monte_carlo]
virality_data = [get_max_diameter(diameters) for diameters in virality_monte_carlo]
simulation_df_inputs_dict = {
    "runs": simulation_runs,
    "visibility": visibility_data,
    "interactivity score": interactivity_score_data,
    "donation_value": donation_value_data,
    "diameter": virality_data,
}
simulation_df = pd.DataFrame(simulation_df_inputs_dict)
simulation_df.set_index(["runs"], inplace=True)
simulation_df.to_csv("./simulation_data/{}{}.csv".format(strategy, NUM_OF_SIMULATIONS))
simulation_df.head()
| visibility | interactivity score | donation_value | diameter | |
|---|---|---|---|---|
| runs | ||||
| 1 | 283479 | 124748 | 1596 | 3 |
| 2 | 326544 | 131341 | 176 | 3 |
| 3 | 315088 | 124125 | 213 | 3 |
| 4 | 314891 | 130745 | 77 | 3 |
| 5 | 312529 | 125561 | 248 | 3 |